
##################################################################
##### Analysis with unsupervised techniques, EU speeches  ########
##################################################################

# OVERVIEW, what we do here:
# I. Load data
# II. After preparing the meta data and running robustness tests of the cleaning procedures in another script ("Analysis_stm_preprobust"),
# we clean the data here and test a selection of models with different Ks before we determine the most suitable K for topic interpretation

# clean environment
# NOTE(review): this removes every object from the workspace the script is
# run in; kept because the rest of the script starts from an empty workspace.
rm(list = ls())

# Set working directory and put the "Data" folder and all files in "RData_for_directory" here!
# Enter the path of that folder in `project_dir`. While it is still the empty
# placeholder, the current working directory is left untouched, because
# setwd("") stops with "cannot change working directory".
project_dir <- ""
if (nzchar(project_dir)) {
  setwd(project_dir)
}

# load data
library(dplyr)
library(tidyverse)
library(tidytext)
library(quanteda)
library(stm)
library(stringr)
library(Rtsne)
library(rsvd)
library(geometry)
library(igraph)
library(stmCorrViz)
library(beepr)


# restore the sorted supercorpus (sorted as per the dictionary results,
# prepared in the preceding script) from its .RData file
load(file = "sorted_supercorpus.RData")


#########################################

###### I. Preprocessing, Cleaning ######

#########################################


# for unsupervised techniques such as LDA and STM,
# preprocessing is crucial and, depending on the choices made, can influence results significantly
# this is why we applied Denny and Spirling's (2018) method to evaluate our preprocessing in the preceding script
# before we do the cleaning here

########################################
########################################

# turn into dfm and apply the standard cleaning procedures:
# punctuation, numbers, lowercase, stopwords, stemming
#
# The cleaning is done on the tokens before the dfm is compiled: calling
# dfm() directly on a corpus with `stem = TRUE` / `remove = ...` is deprecated
# since quanteda 3 and not supported by later releases.
# Order matters: lowercase first, then drop the stopwords while they still
# match the unstemmed stopword list, and stem last.
alldfm <- supercorpus %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english")) %>%
  tokens_wordstem() %>%
  dfm(tolower = FALSE) # tokens are already lowercased above

# trim dfm (common practice to get rid of terms which are disturbing/not highly frequent in corpus)
# we make this rather strict to get better model outcome (less rubbish in the topics)
# keep only words occurring >= 20 times and in >= 2 documents
alldfm <- dfm_trim(alldfm, min_termfreq = 20, min_docfreq = 2)

# IMPORTANT: remove other disturbing things (foreign words with non-English letters, etc.) which appear too frequently in the supercorpus
# for this, we compile a csv file with the 3000 most frequent words in one column
# and manually mark in the second column whether this is a disturbing/irrelevant term which needs to be removed (1),
# since their frequent but irrelevant occurrence could bias the topics of the unsupervised models
# (e.g. country names of included cases, the names of the speakers,
# foreign words and letters, and other nonsense)

# Careful, DO NOT USE THIS CODE, it only served the purpose to compile the list,
# using it again will overwrite the manual coding!!!
# store most frequent 1000 features as dataframe
#x <- topfeatures(alldfm, 1000)
#y <- names(x)
#remove_list <- as.data.frame(y)
#write.csv(remove_list, file = "remove_list.csv", row.names = FALSE)


# load the manually coded list and keep only the terms coded with 1
# (= words to be removed); the file is read without a header row and with
# ";" as column separator: column 1 is the term, column 2 the manual code
coded_terms <- read.csv("remove_list.csv", header = FALSE, sep = ";",
                        stringsAsFactors = FALSE)
# terms that were left uncoded count as "keep"
coded_terms$V2[is.na(coded_terms$V2)] <- 0
# Select by logical condition instead of `-which(code != 1)`: when no row
# fails the condition, which() returns integer(0) and the negative index
# would drop ALL rows, leaving an empty removal list.
# Column 1 is picked explicitly so the result is always a plain vector.
ignorewords <- coded_terms[coded_terms[, 2] == 1, 1]
ignorewords

# further "rubbish" words/letters were identified during test runs of stm;
# that selection was stored as the object 'cleaning' and is restored here
load(file = "cleaning.RData")
# the terms to be ignored are the remove list followed by the cleaning selection
ignorewords <- append(ignorewords, cleaning)
# drop all of these terms from the dfm
alldfm <- dfm_remove(alldfm, pattern = ignorewords)

# sanity check: print the 100 most frequent terms left in the dfm
# (ranked by total count, most frequent first)
topfeatures(alldfm, n = 100, scheme = "count")



############################################

#### II. Apply unsupervised techniques ####

############################################


############################################
##### STM: Structural Topic Model ##########
############################################

# what we do here:
# FIRST: get statistically optimal number of topics
# SECOND: do detailed robustness tests, check several models with alternative numbers of topics 
# (continue analysis and visualization of results of best stm model in next script)

#########################------------------###################################
#########################------------------###################################

# restore the meta data (used below as `meta` in the stm calls)
load(file = "meta_stm.RData")
# generally, the stm package allows two ways of entering meta data: topical prevalence and topical content:

# PREVALENCE: allow the observed metadata to affect the frequency with which a topic is discussed
# for us this means that using ~speaker as prevalence covariate controls for the varying size of the subcorpora
# (word frequency scores are normalized based on the relative size of each subcorpus)

# CONTENT: allow the observed metadata to affect how a particular topic is discussed 
# for us this means that we compare how liberal and illiberal speakers discuss a certain topic


#########################-----------###############################
#########################-----------###############################

# convert quanteda's dfm into the stm input format once, up front;
# otherwise the conversion is repeated for each substep,
# which can take a lot of time
stm_obj <- convert(alldfm, to = "stm")
# NOTE: we don't need docvars here, because we work with the above compiled meta data frame
# export stm object for main stm script
#save(stm_obj, file = "stm_object.RData")

# now, put number of topics to 0 for statistically optimized number of topics, here without meta data:
#stat_model <- stm(stm_obj$documents, vocab = stm_obj$vocab, K= 0, 
#                  prevalence = ~speaker, data = meta, 
#                  verbose = TRUE, max.em.its = 100, init.type = "Spectral")
# 87, too many for topic interpretation...


#########################----------################################
#########################----------################################

# see an earlier version of this script (available upon request)
# which tested different topics with 5<K<30
# based on this, we concluded that a K of something between 10 and 18 seems best to interpret

#################################################################

# II. Compile a selection of models (most robust) for validation

#################################################################

# as earlier tests have shown, a K of 10, 12, 14, 16 or 18
# seems to be the best number of topics for a valid/intuitively interpretable stm model

# compile all 5 models here with same settings, only different Ks, and write script for each of them 
# compile nice LaTeX table which shows that the major difference between this narrowed down selection of models
# is that if K is increased, there are more subtopics, if K is decreased, these subtopics are summarized to one!

# settings for all models:
# prevalence = ~ speaker (to control for the varying size of the subcorpora)
# init.type = "Spectral" (as recommended by Roberts et al.)
# max.em.its = 150 (should be enough)
# content = not needed here

# Fit one structural topic model per candidate number of topics (K).
# All fits use identical settings and differ only in K. For every K the
# fitted model is
#   - kept in the workspace as stm_obj_<K> (e.g. stm_obj_14), and
#   - saved under that same object name in "stm_obj_<K>.RData".
# To test other Ks, change `topic_counts` only.
topic_counts <- c(10, 12, 14, 16, 18)

for (n_topics in topic_counts) {
  model_name <- paste0("stm_obj_", n_topics)
  fitted_model <- stm(stm_obj$documents, vocab = stm_obj$vocab, K = n_topics,
                      prevalence = ~speaker, data = meta,
                      verbose = TRUE, max.em.its = 150, init.type = "Spectral")
  # bind the fit to its K-specific name so that save() stores it under the
  # object name the follow-up scripts expect after load()
  assign(model_name, fitted_model)
  save(list = model_name, file = paste0(model_name, ".RData"))
}
# remove the loop helpers so that only the stm_obj_<K> models remain
rm(n_topics, model_name, fitted_model)

# signal that all models have been fitted and saved
beep(5)


# continue with separate scripts to analyze each of these models in more detail (available upon request) and show most valid topic selection of stm_14
# continue with script "4Analysis_stm14"

